import math
from pathlib import Path

import pandas as pd
import statsmodels.formula.api as smf


def zscore(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    m = s.mean()
    sd = s.std(ddof=0)
    if not (sd and sd > 0):
        return s * 0
    return (s - m) / sd


def main() -> None:
    root = Path(__file__).resolve().parents[1]
    ess_path = root / "outputs" / "ess_r8_r11_min.parquet"
    bb_path = root / "data_external" / "broadband_country_year.csv"
    out_path = root / "outputs" / "quick_first_stage.txt"

    ess = pd.read_parquet(ess_path)
    bb = pd.read_csv(bb_path)
    bb["year"] = pd.to_numeric(bb["year"], errors="coerce").astype("Int64")

    merged = ess.merge(bb, left_on=["cntry", "survey_year"], right_on=["cntry", "year"], how="left")

    # Simple composite instrument (country-year): average of standardized fixed broadband and internet users.
    merged["z_fixed_bb"] = zscore(merged["fixed_broadband_per100"])
    merged["z_inet_users"] = zscore(merged["internet_users_pct"])
    merged["z_infra_index"] = (merged["z_fixed_bb"] + merged["z_inet_users"]) / 2

    # Minimal cleaning for first stage checks
    merged["netusoft_num"] = pd.to_numeric(merged.get("netusoft"), errors="coerce")
    merged["netustm_num"] = pd.to_numeric(merged.get("netustm"), errors="coerce")
    merged["log_netustm"] = (merged["netustm_num"].clip(lower=0)).map(lambda x: math.log1p(x) if pd.notna(x) else x)

    lines = []
    lines.append(f"Rows (ESS merged): {len(merged):,}")
    lines.append(f"Countries: {merged['cntry'].nunique(dropna=True)}")
    lines.append(f"Survey years: {sorted(int(x) for x in merged['survey_year'].dropna().unique())}")
    lines.append("")

    for dep, dep_label in [("netusoft_num", "netusoft (ordinal numeric)"), ("log_netustm", "log(1+netustm)")]:
        df = merged[[dep, "z_infra_index", "cntry", "survey_year"]].dropna()
        if df.empty:
            lines.append(f"[SKIP] {dep_label}: no data after dropna")
            continue
        # Country FE + survey-year FE, clustered by country.
        model = smf.ols(f"{dep} ~ z_infra_index + C(cntry) + C(survey_year)", data=df).fit(
            cov_type="cluster", cov_kwds={"groups": df["cntry"]}
        )
        coef = model.params.get("z_infra_index", float("nan"))
        se = model.bse.get("z_infra_index", float("nan"))
        t = coef / se if se and not math.isnan(se) else float("nan")
        lines.append(f"[First stage] {dep_label}: coef(z_infra_index)={coef:.4f}, se={se:.4f}, t={t:.2f}, n={len(df):,}")

    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"Wrote: {out_path}")


if __name__ == "__main__":
    main()

